Web Scraping with Python (1)






Chaitanya Tejaswi


21st April, 2020

Objectives

Create eBook from Webpages

Assumptions

Dependencies

Motivation for This Talk

“Can I read this on my Kindle?”

Observations

“Can I read this on my Kindle?”

The Solution: On Amazon Kindle

The Solution: On Android Device

How To Do It?

  1. Send an HTTP request to the server for the file.
  2. Get the file, extract the “title” & “judgement” (summary) from it.
  3. Save this to a text/html file.
  4. Convert this file to an eBook, particularly one that is compatible with Android & Kindle.

But First, Some Prerequisites

Send A Request, Retrieve A File

from urllib import request
...
response = request.urlopen(url).read().decode('utf-8')

Create An HTML object

from bs4 import BeautifulSoup
...
html = BeautifulSoup(response, 'lxml')

Finding <tags>

# Find headline of text
headline = article.h2.a.text

Syntax

.find(tag, attributes, recursive, text, keywords)
.find_all(tag, attributes, recursive, text, limit, keywords)
# [tag] Find all headings in the page
.find_all('h1')
.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# [attributes] Find all <span> that contain green/red colored text
.find_all('span', {'class': {'green', 'red'}})

# [text] How many times is "Happy Birthday" displayed on the webpage?
.find_all(text='Happy Birthday')

# [keywords]
.find_all(id='span', class_={'green', 'red'})
Note: values inside one set are OR-ed together; separate filters are AND-ed.
# Find all title-summary combinations that are colored in green or red
.find_all('div', id={'title','summary'}, class_={'green', 'red'})

Observations

“Can I read this on my Kindle?”

Source Code: Let’s Jump In!

#!/usr/bin/env python3
"""Scrape a judgment page from indiankanoon.org and save it as local HTML."""
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch document `urlId`, extract its title and judgment body, and
    write them to `<urlId>.html` in the current directory.

    Errors (network failures, missing divs) are printed, not raised —
    this is best-effort demo code.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Broad catch is deliberate here: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
"""Scrape a judgment page from indiankanoon.org and save it as local HTML."""
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch document `urlId`, extract its title and judgment body, and
    write them to `<urlId>.html` in the current directory.

    Errors (network failures, missing divs) are printed, not raised —
    this is best-effort demo code.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # [1] <-- Process the links
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # [2] <-- Automatically open the file
    except Exception as e:
        # Broad catch is deliberate here: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
"""Scrape a judgment from indiankanoon.org, save it as HTML, and open it."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch document `urlId`, rewrite its relative links to absolute ones,
    write `<urlId>.html` to the current directory, and open it (Windows
    `start` command). Errors are printed, not raised — best-effort demo code.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Turn site-relative hrefs (e.g. href="/doc/123") into absolute URLs
        # so the saved file's links still work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # WARNING: shell=True with argv-derived `urlId` allows shell
        # injection; acceptable only for this local demo. `start` is a
        # Windows cmd builtin and requires the shell.
        subprocess.run(f'''start {urlId}.html''', shell=True)
        # [3] <-- Save ebook (epub/mobi)
    except Exception as e:
        # Broad catch is deliberate here: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
"""Scrape a judgment from indiankanoon.org and build html/epub/mobi eBooks."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    """Fetch document `urlId`, save it as `<urlId>.html`, convert it to
    EPUB (pandoc) and MOBI (kindlegen), then open the EPUB (Windows
    `start`). Errors are printed, not raised — best-effort demo code.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Turn site-relative hrefs (e.g. href="/doc/123") into absolute URLs
        # so the saved file's links still work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid "\s" escape (SyntaxWarning, a
        # future SyntaxError) and works with pandoc on every platform.
        # WARNING: shell=True with argv-derived `urlId` allows shell
        # injection; acceptable only for this local demo.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        # Broad catch is deliberate here: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Disclaimer: Don’t Use This In Production Code

#!/usr/bin/env python3
"""Scrape a judgment from indiankanoon.org and build html/epub/mobi eBooks."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId): # [1]
    """Fetch document `urlId`, save it as `<urlId>.html`, convert it to
    EPUB (pandoc) and MOBI (kindlegen), then open the EPUB (Windows
    `start`). Errors are printed, not raised — best-effort demo code.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8') # [2]
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Turn site-relative hrefs (e.g. href="/doc/123") into absolute URLs
        # so the saved file's links still work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid "\s" escape (SyntaxWarning, a
        # future SyntaxError) and works with pandoc on every platform.
        # WARNING: shell=True with argv-derived `urlId` allows shell
        # injection; acceptable only for this local demo.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        # Broad catch is deliberate here: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Slightly Better

#!/usr/bin/env python3
"""Scrape a judgment from indiankanoon.org and build html/epub/mobi eBooks."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId): # [1]
    """Fetch document `urlId`, save it as `<urlId>.html`, convert it to
    EPUB (pandoc) and MOBI (kindlegen), then open the EPUB (Windows
    `start`). Errors are printed, not raised — best-effort demo code.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8') # [2]
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Turn site-relative hrefs (e.g. href="/doc/123") into absolute URLs
        # so the saved file's links still work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid "\s" escape (SyntaxWarning, a
        # future SyntaxError) and works with pandoc on every platform.
        # WARNING: shell=True with argv-derived `urlId` allows shell
        # injection; acceptable only for this local demo.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    # HTTPError must come before URLError: it is a subclass of URLError.
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        # Final catch-all for parsing/filesystem errors: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Document Your Code (using docstrings)

#!/usr/bin/env python3
"""Scrape a judgment from indiankanoon.org and build html/epub/mobi eBooks."""
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId=None): # [1]
    '''
    Scrapes & Generates html/epub/mobi versions of document.

    urlId: the numeric document id from an indiankanoon.org /doc/ URL.
    Writes <urlId>.html, converts it to EPUB (pandoc) and MOBI (kindlegen),
    then opens the EPUB (Windows `start`). Errors are printed, not raised.
    Returns None.
    '''
    url = f'{urlBase}/doc/{urlId}'
    try:
        # Download the page; the site serves UTF-8, so decode explicitly.
        response = request.urlopen(url).read().decode('utf-8') # [2]
        html = BeautifulSoup(response, 'lxml')
        # Site-specific markup: the judgment text and its title live here.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Turn site-relative hrefs (e.g. href="/doc/123") into absolute URLs
        # so the saved file's links still work offline.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # Write as UTF-8 explicitly: the platform default (e.g. cp1252 on
        # Windows) raises UnicodeEncodeError on non-ASCII judgment text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid "\s" escape (SyntaxWarning, a
        # future SyntaxError) and works with pandoc on every platform.
        # WARNING: shell=True with argv-derived `urlId` allows shell
        # injection; acceptable only for this local demo.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    # HTTPError must come before URLError: it is a subclass of URLError.
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        # Final catch-all for parsing/filesystem errors: report and continue.
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Use Virtual Environments (venv)

Why Virtual Environments?
To isolate packages used in a project from the packages installed on the system.

Steps
python -m venv myProject\venv
myProject\venv\Scripts\activate.bat
pip list
pip install [package-name]
pip install -r requirements.txt
pip freeze > requirements.txt
deactivate
rmdir /s myProject\venv
Example

Homework

Make Your Own eBook

Scrape the article on this webpage, and create your own ebook using the code used in this talk.

Steps
  1. Visit the Dependencies page of this talk and install all necessary software.
  2. Modify final code to capture the article’s heading & main-content.
  3. Create an HTML & save it locally.
  4. Convert this HTML file to EPUB, and try opening it on your phone using Google’s Play Books app.
Solutions will be posted on Saturday (25-04-2020)

References

[1] “Web Scraping using Python” by Corey Schafer
[2] “RegEx using Python” by Corey Schafer
[3] “Python venv (Windows)” by Corey Schafer
[4] “Web Scraping with Python” by Ryan Mitchell
[5] “Legal Aspects” by Data Carpentry